library(stringi)
library(data.table)

st_data_path <- "~/Desktop/Temperature study/CASE-023597-Parsonnet/mira_encounters_061318.csv"
st_data_clean_path <- "~/Desktop/Temperature study/CASE-023597-Parsonnet/mira_encounters_061318_clean.csv"
st_data_diag_path <- "~/Desktop/Temperature study/CASE-023597-Parsonnet/mira_encounters_061318_diag.csv"


# This gets all the unique codes for this person, including spliting the comma-separated codes
# The argument is a list of DX values, possibly empty or comma-separated withtin each value
unique_ICD9_per_person <- function(dxValues) {
	codes <- character()
	for (v in dxValues) {
		codes <- c(codes,strsplit(as.character(v),", "))
	}
	codes <- as.character(unlist(codes)) 
	return(unique(codes[!is.na(codes)]))
}


should_reload_raw_data <- FALSE

# Set "should_reload_raw_data" to TRUE to recompute all the tables (also change the file names above if necessary)
if (should_reload_raw_data) {
	ST_data<- read.csv(st_data_path)
	
	# Remove dosage info from all medications (only keep the name of the drug)
	if (FALSE) {
	for (cc in which(substr(colnames(ST_data),1,3) == 'MED')) {
		# Use regex to remove the dosage information
		tmp <- stri_extract_first_regex(ST_data[,cc], "^[0-9 -/']*?[A-Z -]+")
		# Fix vitamin B
		vb <- stri_extract_first_regex(ST_data[,cc], "^VITAMIN B-[0-9]+")
		tmp[!is.na(vb)] <- vb[!is.na(vb)]
		# Fix PO
		po <- stri_extract_first_regex(tmp, "^.+?(?= PO)")
		tmp[!is.na(po)] <- po[!is.na(po)]
		ST_data[,cc] <- trimws(tmp)
	}
	}
	
	No <- length(ST_data$TEMPERATURE)
	
	# Shorten the DX variables to only give codes before the decimal
	dx_vars <- which(substr(colnames(ST_data),1,2) == 'DX')
	for (cc in dx_vars) {
		tmp <- stri_extract_first_regex(ST_data[,cc], "^([0-9A-Z]+)")
		ST_data[,cc] <- trimws(tmp)
	}
	
	# Create a single variable to store the primary diagnosis 
	ST_data$primary_dx <- rep(NA,No)
	primary_vars <- which(substr(colnames(ST_data),1,7) == 'PRIMARY')
	for (pp in primary_vars) {
		yy <- ST_data[,pp] == 'Y'
		ST_data$primary_dx[yy] <- as.character(ST_data[yy,pp-1])
	}
	
	# Get the indexes of DX and MED varaibles
	med_vars <- which(substr(colnames(ST_data),1,3) == 'MED')
	new_med_vars <- which(substr(colnames(ST_data),1,7) == 'NEW_MED')
	
	# Create an extra table that matches people to diagnoses
	# This table in each row has the row number (in ST_data table) of a visit, a code for the disease/medication that was present
	# The third column differentiates between primary diagnosis ("P"), secondary diagnosis ("S"), and medications ("M")
	ST_data$row_id <- 1:No
	
	ST_data_match <- list()
	
	for (pid in 1:No) {
		# Get the codes of a person at index pid
		dx_codes <- unique_ICD9_per_person(ST_data[pid,dx_vars])
		#med_codes <- unique_ICD9_per_person(ST_data[pid,med_vars])
		med_codes <- as.character(unlist(ST_data[pid,med_vars]))
		new_med_codes <- (as.character(unlist(ST_data[pid,new_med_vars])))[nchar(med_codes) > 0]
		med_codes <- med_codes[nchar(med_codes) > 0]
		rr <- rep(ST_data$row_id[pid],length(dx_codes)+length(med_codes)) # Repeat the row_id for every row 
		dd <- unlist(c(dx_codes,med_codes))
		tt <- unlist(c(rep("S",length(dx_codes)),new_med_codes))
		tt[dx_codes == ST_data$primary_dx[pid]] <- "P"
		ST_data_match[[pid]] <- data.frame(row_id = rr, dx=dd, type = tt)
	}
	
	ST_data_match <- rbindlist(ST_data_match) # This is a quick way to combine a list of data frames into a single long data frame
	ST_data_match$dx <- as.factor(ST_data_match$dx)
	ST_data_match$type <- as.factor(ST_data_match$type)
	
	# Drop all the colums with diagnoses since we do not need them anymore
	ST_data <- ST_data[,-c(dx_vars,new_med_vars,med_vars,primary_vars)]
	
	write.csv(ST_data, st_data_clean_path)
	write.csv(ST_data_match, st_data_diag_path)
} else {
	ST_data<- read.csv(st_data_clean_path)
	ST_data_match <- read.csv(st_data_diag_path)
	No <- length(ST_data$TEMPERATURE)
}

# Use the same codes for the primary diagnosis
ST_data$primary_dx <- factor(ST_data$primary_dx,levels=levels(ST_data_match$dx))
	
ST_data$ANON_ID <- as.factor (ST_data$ANON_ID)
ST_data$exam_year <- as.numeric (substr(ST_data$TIME_OF_VISIT,1,4))
ST_data$exam_month <- as.numeric (substr(ST_data$TIME_OF_VISIT,6,7))
ST_data$time_HR <- as.numeric(substr((ST_data$TIME_OF_VISIT),12,13))
ST_data$AGE <- as.numeric(ST_data$AGE)
ST_data$birth_year <- ST_data$exam_year - (ST_data$AGE/12)

ST_data_race <- read.csv("~/Desktop/Temperature study/patients_race.csv")
ST_data_race$ANON_ID <- factor(ST_data_race$ANON_ID,levels=levels(ST_data$ANON_ID))

ST_data$study_ID <- paste("ST_",ST_data$ANON_ID,sep="")

ST_data$race<- as.character(ST_data_race$CANONICAL_RACE[match(ST_data$ANON_ID,ST_data_race$ANON_ID)])
ST_data$race [ST_data$race%in% c("Native American", "Pacific Islander","Other")] <- "Other"
ST_data$race [ST_data$race%in% c("Unknown")] <- NA
ST_data$race<- factor(ST_data$race,levels = c("White", "Black","Asian" ,"Other"))


ST_data$birth_cohort <- NULL
ST_data$birth_cohort[ ST_data$birth_year <1920 &ST_data$birth_year >=1910] <-"1910s"
ST_data$birth_cohort[ ST_data$birth_year <1930 &ST_data$birth_year >=1920] <-"1920s"
ST_data$birth_cohort[ ST_data$birth_year <1940 &ST_data$birth_year >=1930] <-"1930s"
ST_data$birth_cohort[ ST_data$birth_year <1950 &ST_data$birth_year >=1940] <-"1940s"
ST_data$birth_cohort[ ST_data$birth_year <1960 &ST_data$birth_year >=1950] <-"1950s"
ST_data$birth_cohort[ ST_data$birth_year <1970 &ST_data$birth_year >=1960] <-"1960s"
ST_data$birth_cohort[ ST_data$birth_year <1980 &ST_data$birth_year >=1970] <-"1970s"
ST_data$birth_cohort[ ST_data$birth_year <1990 &ST_data$birth_year >=1980] <-"1980s"
ST_data$birth_cohort[ ST_data$birth_year <2000 &ST_data$birth_year >=1990] <-"1990s"
ST_data$birth_cohort[ ST_data$birth_year <2010 &ST_data$birth_year >=2000] <-"2000s"
ST_data$birth_cohort[ ST_data$birth_year <2020 &ST_data$birth_year >=2010] <-"2010s"

ST_data$birth_cohort <- as.factor (ST_data$birth_cohort)


ST_data$BMI_cat <-  NULL
ST_data$BMI_cat [ ST_data$BMI < 19 ] <-"Underweight"
ST_data$BMI_cat [ ST_data$BMI < 30 & ST_data$BMI >=19] <-"Normal/Overweight"
#ST_data$BMI_cat [ ST_data$BMI < 30 & ST_data$BMI >=25] <-"Overweight"
ST_data$BMI_cat [ ST_data$BMI >=30 ] <- "Obese" 
ST_data$BMI_cat <- as.factor (ST_data$BMI_cat)

ST_data$age_years <- ST_data$AGE / 12


ST_data$temp_C <- (ST_data$TEMPERATURE - 32)/1.8

unusual_dx_codes <- c("R50")
unusual_observations <- unique(ST_data_match$row_id[ST_data_match$dx %in% unusual_dx_codes])

ST_data$weight_KG <- ST_data$WEIGHT * 0.0283495
ST_data$height_CM <- sqrt(ST_data$weight_KG/ST_data$BMI) * 100

ST_data_subset <- !is.na(ST_data$TEMPERATURE) & ST_data$temp_C>35 & ST_data$temp_C< 39 & (is.na(ST_data$primary_dx) || !(ST_data$primary_dx %in% unusual_dx_codes)) & !is.na(ST_data$age_years) & ST_data$age_years>20 & ST_data$age_years<80 & ST_data$BMI>10 & ST_data$BMI<50 & !is.na(ST_data$BMI) & !is.na(ST_data$birth_year) & ST_data$GENDER!="Unknown" & !is.na(ST_data$weight_KG) & !is.na(ST_data$height_CM) & ST_data$weight_KG > 30 & ST_data$weight_KG < 200 & ST_data$height_CM > 120 & ST_data$height_CM < 220 


ST_data <-ST_data[ST_data_subset, ]


write.csv(ST_data , '~/Desktop/Temperature study/STRIDE_processed.csv')

ST_data$bmi_adj <- residuals(lm(BMI ~ height_CM, data=ST_data))
lmSTRIDE_BMI_adj <-lm(temp_C ~ age_years + height_CM  + bmi_adj, data=ST_data[ST_data$GENDER  == "Male" & ST_data$race == "White", ])

summary (lmSTRIDE_BMI_adj)

ST_data$height_norm <- ST_data$height_CM - mean(ST_data$height_CM,na.rm=T)
ST_data$weight_norm <- ST_data$weight_KG - mean(ST_data$weight_KG,na.rm=T)


lmSTRIDE_interaction <- lm(temp_C ~ age_years + time_HR + height_norm*weight_norm  ,data=ST_data[ST_data$GENDER == "Male" & ST_data$race == "White", ])


ST_data$height_norm_log <- log(ST_data$height_CM) - mean(log(ST_data$height_CM),na.rm=T)
ST_data$weight_norm_log <- ST_data$weight_KG - mean(log(ST_data$weight_KG),na.rm=T)


lmSTRIDE_interaction_log <- lm(temp_C ~ age_years + time_HR + height_norm_log*weight_norm_log  ,data=ST_data[ST_data$GENDER == "Male" & ST_data$race == "White", ])



ST_data$bmi_adj <- residuals(lm(BMI ~ height_CM, data=ST_data))

lmSTRIDE_BMI_adj <-lm(lm(temp_C ~ age_years + time_HR + height_CM  + bmi_adj, data=ST_data [ST_data$GENDER == "Male" & ST_data$race == "White", ])
